{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "df870754-c9b3-400f-b2ad-6520d4cf0dff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nNaive Bayes multi-class classifier implemented from scratch.\\nHandles zero frequency corrections/smoothing.\\n'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "Naive Bayes multi-class classifier implemented from scratch.\n",
    "Handles zero frequency corrections/smoothing.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "35404177-1909-4745-8d83-c74c15eebb97",
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import numpy"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2105c758-c370-4352-9633-bafbe0057730",
   "metadata": {},
   "source": [
    "### 1. Calculating prior probabilities\n",
    "Priors are the probabilities of each class label, estimated from the labeled data alone (before looking at any words)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1cd21916-7022-43c9-8418-ef69b068869c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helpers (Priors) ========================================\n",
    "\n",
    "def calculate_frequency_average(series):\n",
    "    \"\"\" Calculates probabilites of occurences of each label out of the entire set \"\"\"\n",
    "    try:\n",
    "        series_averages = series.value_counts() / len(series)\n",
    "        return series_averages.to_dict()\n",
    "    except ZeroDivisionError as exception:\n",
    "        raise exception"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a23754d-630e-4181-9586-7d99be584646",
   "metadata": {},
   "source": [
    "### 2. Training a Naive Bayes model\n",
    "To train the model we need the word frequencies of the text. Our plan is to build a dictionary that records every word together with how many times it occurs under each label (for example, a pair of counts for spam and ham).\n",
    "\n",
    "Sometimes new text contains a word that we haven't seen before. For the math to check out (avoid dividing by zero), we add a small offset to each count, derived from the whole text: the word's number of occurrences divided by the total number of words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d3161f72-140e-407e-8cf7-8fc45b460de0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helpers (Dictionaries) =========================================\n",
    "\n",
    "def merge_dicts(dict1, dict2):\n",
    "    dictionary = dict1.copy()\n",
    "    dictionary.update(dict2)\n",
    "    return dictionary\n",
    "\n",
    "def construct_frequency_dict_from_series(series_text):\n",
    "    \"\"\" Converts text to lower-case then returns dictionary of word counts \"\"\"\n",
    "    series_counts = (\n",
    "        series_text\n",
    "        .str.lower()\n",
    "        .str.split() # string -> list\n",
    "        .explode() # produces row for each item in list\n",
    "        .value_counts()\n",
    "    )\n",
    "    \n",
    "    return series_counts.to_dict()\n",
    "\n",
    "def construct_frequency_dict_from_strings(list_strings):\n",
    "    \"\"\" Converts text to lower-case then returns list of unique words \"\"\"\n",
    "    string = \" \".join(list_strings)\n",
    "    string = string.lower()\n",
    "    list_words = string.split()\n",
    "\n",
    "    return dict(collections.Counter(list_words))\n",
    "\n",
    "# Model =========================================\n",
    "\n",
    "def calculate_labeled_frequencies(dict_frequencies_text, dataframe, column_label, column_words):\n",
    "    \"\"\" \n",
    "    Constructs a frequency dictionary for list of words.\n",
    "    Uses a processed dataframe with list words column.\n",
    "    \n",
    "    Handles zero frequency occurences by adding n(w) / total,\n",
    "    in which n(w) is the number of occurences of the word\n",
    "    across all text, and total is the total number of words\n",
    "    in the text.\n",
    "    \n",
    "    Parameter\n",
    "    ----------\n",
    "    dict_frequencies_text = { word : n(word) }\n",
    "    dataframe[column_words] = series of list of strings\n",
    "\n",
    "    \"\"\"\n",
    "    list_labels = dataframe[column_label].unique()\n",
    "    total = sum(dict_frequencies_text.values())\n",
    "    \n",
    "    # Doing it this way avoids copy errors with nested dictionaries\n",
    "    model = {}\n",
    "    for word in dict_frequencies_text.keys():\n",
    "        model.setdefault(word, {label : 0 for label in list_labels })\n",
    "\n",
    "    # Split label column into groups so we can count them directly\n",
    "    group_labels = dataframe.groupby(column_label)\n",
    "    for label, label_df in group_labels:\n",
    "        for list_words in label_df[column_words]:\n",
    "            for word in list_words:\n",
    "                model[word][label] += 1\n",
    "\n",
    "    # Handles the zero frequency offset\n",
    "    for word, dict_frequency in model.items():\n",
    "        offset = max(dict_frequencies_text[word] / total, 1E-8)\n",
    "        for key in dict_frequency.keys():\n",
    "            dict_frequency[key] += offset\n",
    "\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "297942c9-8ea9-4a40-9b8d-588235cadf3f",
   "metadata": {},
   "source": [
    "### 3. Using the model to make predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d87eff2b-fc17-4758-9a5b-640b0ae69a8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_bayes(word, label, dict_frequencies):\n",
    "    \"\"\" \n",
    "    Doesn't use the naive assumption.\n",
    "    likelihood = (num labeled text with word) / sum(num labeled text with word for all labels)\n",
    "               = P(A | Event_j) / sum ( P (A | Event_i) )\n",
    "    \"\"\"\n",
    "    label_count = dict_frequencies[word][label]\n",
    "    all_label_counts = sum(dict_frequencies[word].values())\n",
    "\n",
    "    try:\n",
    "        return label_count / all_label_counts\n",
    "    except ZeroDivisionError:\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0f63d73f-3cc5-4866-904f-de6aac50578b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helpers (Probabilities) =============================\n",
    "\n",
    "def calculate_list_product(list_):\n",
    "    \"\"\" Slightly faster to work on arrays than directly with list \"\"\"\n",
    "    return numpy.array(list_).prod()\n",
    "\n",
    "# Naive Bayes Classifier ==============================\n",
    "\n",
    "def setup_naive_bayes(dict_frequencies_new_text, dataframe, column_label, column_words, column_text):\n",
    "    \"\"\" \n",
    "    Adds new text to the model so it can be used to make new predictions.\n",
    "    This is mostly just an accumulation of the previous cells.\n",
    "\n",
    "    The word counts of the new text are added to the word counts of\n",
    "    dataframe[column_text], so a word present in both keeps its combined\n",
    "    count n(w) for the zero frequency offset instead of being overwritten\n",
    "    by the count from the new text alone.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dict_frequencies_new_text = { word : n(word) } of the new text\n",
    "    dataframe[column_label] = series of labels\n",
    "    dataframe[column_words] = series of list of strings\n",
    "    dataframe[column_text] = series of strings\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The model returned by calculate_labeled_frequencies\n",
    "    \"\"\"\n",
    "    \n",
    "    dict_frequencies_whole_text = construct_frequency_dict_from_series(dataframe[column_text])\n",
    "\n",
    "    # Counter.update adds the counts of shared words; dict.update would replace them\n",
    "    counter_frequencies = collections.Counter(dict_frequencies_whole_text)\n",
    "    counter_frequencies.update(dict_frequencies_new_text)\n",
    "\n",
    "    dict_model = calculate_labeled_frequencies(\n",
    "        dict(counter_frequencies), \n",
    "        dataframe, \n",
    "        column_label, \n",
    "        column_words)\n",
    "\n",
    "    return dict_model\n",
    "\n",
    "def calculate_naive_bayes(list_words, dict_frequencies, series_labels):\n",
    "    list_labels = series_labels.unique()\n",
    "    counts_label = series_labels.value_counts()\n",
    "    total = len(series_labels)\n",
    "    \n",
    "    # Calculate the probabilty of given label for each word, then accumulates.\n",
    "    dict_naive_bayes = { label : 1 for label in list_labels }\n",
    "    for word in list_words:\n",
    "        for label in list_labels:\n",
    "            probability = dict_frequencies[word][label] / counts_label[label]            \n",
    "            if probability == 0:\n",
    "                print(word)\n",
    "            dict_naive_bayes[label] *= (probability * total)\n",
    "\n",
    "    # Multiply by the total number of elements for each label\n",
    "    for label in list_labels:\n",
    "        dict_naive_bayes[label] *= counts_label[label]\n",
    "    \n",
    "    return dict_naive_bayes\n",
    "\n",
    "def predict_naive_bayes(document, label_to_predict, dict_frequencies, series_labels):\n",
    "    \"\"\"\n",
    "    Uses the naive assumption to predict on a given document.\n",
    "    The document is lower-cased and split into words, each distinct word is used once.\n",
    "    Returns the score of label_to_predict divided by the sum of the scores of all labels.\n",
    "    \"\"\"\n",
    "    unique_words = set(document.lower().split())\n",
    "    scores = calculate_naive_bayes(unique_words, dict_frequencies, series_labels)\n",
    "    return scores[label_to_predict] / sum(scores.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a333eb42-e9b7-4972-a640-fa8e509f87cd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
